# author: doojin
# 2012.08.15

import re
import csv
from bs4 import BeautifulSoup

def Convert(html):
  """Extract the hanja entries from one saved result page.

  This is Python 2 code: it relies on the `unicode` builtin.

  html: unicode. Markup of a page that contains at least one
    <td class="hanja"> cell.
  returns:
    list of dictionary (utf8), one per entry, with the keys
    'hanja', 'pronan', 'meaning', 'stroke',
    'five_element'
  raises:
    AttributeError: no <td class="hanja"> was found, or an entry lacks
      the hangul span / info cell / <br> this parser expects.
    AssertionError: the info cell does not have exactly three children.
    IndexError: the hangul text is blank, or the stroke count /
      five-element text is not found in the info cell.
  """
  def _make_hanja(hanja, pronan, meaning, stroke, five_element):
    # Encode every field to utf8 once, at the output boundary.
    return {'hanja': hanja.encode('utf8'),
            'pronan': pronan.encode('utf8'),
            'meaning': meaning.encode('utf8'),
            'stroke': stroke.encode('utf8'),
            'five_element': five_element.encode('utf8')}

  # NOTE(review): no parser is named, so bs4 picks one of the installed
  # parsers; the five-level parent walk below presumably depends on the
  # tree that parser builds - confirm before pinning a parser.
  soup = BeautifulSoup(html)

  # Find the table that wraps all entries: climb five levels up from the
  # first hanja cell, then keep each nested table that holds a hanja cell.
  td = soup.find('td', attrs={'class': 'hanja'})
  table = td.parent.parent.parent.parent.parent
  contents = [content for content in table.find_all('table')
              if content.find('td', attrs={'class': 'hanja'})]

  hanja_list = []
  for content in contents:
    hanja = unicode(content.find('td', attrs={'class': 'hanja'}).string)
    hangul = unicode(content.find('span', attrs={'class': 'hangul'}).string)

    # The last whitespace-separated word is the pronunciation and the
    # words before it are the meaning. The word count must be compared,
    # not the list itself: `list > int` is always True on Python 2, which
    # made the single-word branch unreachable.
    words = hangul.split()
    if len(words) > 1:
      pronan = words[-1]
      meaning = u" ".join(words[:-1])
    else:
      # Single word: its last character is the pronunciation and the
      # rest is the meaning. Blank text raises IndexError here.
      word = words[0]
      pronan = word[-1]
      meaning = word[:-1]

    info = content.find('td', style='line-height:2.0;')
    assert 3 == len(info.contents), 'hangul, stroke, five-element'
    # The second child carries the stroke count; take its first number.
    stroke = re.findall(r'\d+', info.contents[1])[0]
    # The third child carries the five-element value after its label,
    # in the first node inside the <br>.
    five_element = re.findall(u'\uc790\uc6d0\uc624\ud589 : (\S+)',
         info.contents[2].br.contents[0])[0]
    hanja_list.append(_make_hanja(
        hanja, pronan, meaning, stroke, five_element))

  return hanja_list

def GetHtml(filename):
  """Read a file and decode its contents as utf8.

  filename: string, path of the file to read.
  return: unicode. Bytes that are not valid utf8 are dropped
    ('ignore') instead of raising.
  """
  # the encoding of original file is 'utf8'
  # Read raw bytes (binary mode) so the explicit decode below is the only
  # text conversion, and let `with` close the handle.
  with open(filename, 'rb') as f:
    return f.read().decode('utf8', 'ignore')


def OutputCsv(filename, hanja_list):
  """Write the entries to a CSV file, one row per entry.

  No header row is written. Columns appear in the order
  hanja, pronan, meaning, stroke, five_element.

  filename: string, path of the file to create or overwrite.
  hanja_list:
    list of dictionary
    'hanja', 'pronan', 'meaning', 'stroke',
    'five_element'
  """
  fieldnames = ['hanja', 'pronan', 'meaning', 'stroke', 'five_element']
  # Values are written as given; nothing is re-encoded here.
  # `with` guarantees the file is flushed and closed before returning.
  with open(filename, 'w') as f:
    w = csv.DictWriter(f, fieldnames)
    w.writerows(hanja_list)


def main():
  """Convert every downloaded page and write the combined CSV.

  Reads ../raw_data/hanja_<i>.html for i = 0, 20, ..., 4820 and writes
  all parsed entries to ../data/hanja.csv. Both paths are relative to
  the current working directory.
  """
  hanja_list = []
  for i in range(0, 4821, 20):
    # Progress indicator. For a single value the parenthesized form
    # prints the same as the Python 2 print statement, and it also
    # parses on Python 3.
    print(i)
    filename = '../raw_data/hanja_%d.html' % i
    html = GetHtml(filename)
    hanja_list.extend(Convert(html))
  OutputCsv('../data/hanja.csv', hanja_list)
  

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
  main()  
